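'''
@-*- coding: utf-8 -*-
@ python: python 3.9
@ Author: allen
@ Created: 2024/8/20

Scrape second-hand housing listings for one Chongqing district from
cq.lianjia.com and save them to a CSV file.
'''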

# https://cq.lianjia.com/ershoufang/
# https://cq.lianjia.com/ershoufang/jiangbei/
# https://cq.lianjia.com/ershoufang/jiangbei/pg1/
# https://cq.lianjia.com/ershoufang/jiangbei/pg2/

import atexit
import csv
import os
import re
import time

import parsel
import requests

# District slug inserted into the listing URL (.../ershoufang/<area>/pgN/).
area = 'shapingba'
# District name embedded in the output file name.
fileName = '沙坪坝'

# open() does not create missing parent directories, so make sure the output
# folder exists before opening the CSV for writing.
os.makedirs('./链家数据', exist_ok=True)

# newline='' is what the csv module expects so it can control line endings.
f = open(f'./链家数据/链家二手房_重庆_{fileName}.csv', mode='w', encoding='utf-8', newline='')
# The handle is shared by the whole script; close (and thereby flush) it at
# interpreter exit. Closing an already-closed file is a harmless no-op.
atexit.register(f.close)

# Column order of the output file; rows are written as dicts keyed by these names.
csv_writer = csv.DictWriter(f, fieldnames=[
    '链接',
    '标题',
    '小区',
    '位置',
    '单价',
    '总价',
    '类型',
    '面积',
    '方位',
    '装修',
    '楼层',
    '楼层数',
    '建立时间',
    '信息',
    '关注人数',
    '发布时间',
    '标签',
    '推荐标签'
])
# Write the column names once so the CSV is self-describing.
csv_writer.writeheader()

# Placeholder for values that cannot be extracted (the same marker the
# construction-year column already used).
_UNKNOWN = '未知'
# First run of digits in the floor description, e.g. the total floor count.
_DIGITS_RE = re.compile(r'\d+')


def _parse_house_info(text):
    """Split the ' | '-separated house-info text into its CSV columns.

    Returns a dict keyed by the CSV column names, or None when *text* is
    missing or has fewer than the five leading fields that are read by index.
    """
    if text is None:
        return None
    parts = text.split(' | ')
    if len(parts) < 5:
        return None

    floor_text = parts[4]
    floor_digits = _DIGITS_RE.search(floor_text)
    return {
        '类型': parts[0],
        '面积': parts[1].replace('平米', ''),
        '方位': parts[2],
        '装修': parts[3],
        # Only the first character of the floor description is kept.
        '楼层': floor_text[:1],
        '楼层数': floor_digits.group() if floor_digits else _UNKNOWN,
        # The sixth field is taken as the build date only in the 7-field layout.
        '建立时间': parts[5] if len(parts) == 7 else _UNKNOWN,
        '信息': parts[-1],
    }


def _parse_listing(div):
    """Turn one listing <li> selector into a CSV row dict, or None to skip it."""
    total_price = div.css('.totalPrice span::text').get()
    if total_price is None:
        # <li> elements without a total price are not written out.
        return None

    href = div.css('a::attr(href)').get()
    house = _parse_house_info(div.css('.houseInfo::text').get())
    if house is None:
        print(f'房屋信息格式异常，已跳过: {href}')
        return None

    # Community name and location come from the links in .positionInfo;
    # never reuse values left over from a previous listing.
    position = div.css('.positionInfo a::text').getall()
    community = position[0] if len(position) > 0 else _UNKNOWN
    location = position[1] if len(position) > 1 else _UNKNOWN

    unit_price = div.css('.unitPrice span::text').get()
    unit_price = unit_price.replace('元/平', '') if unit_price is not None else _UNKNOWN

    # Follow info is "<followers>/<publish time>"; surrounding whitespace is
    # kept exactly as scraped.
    follow_text = div.css('.followInfo ::text').get()
    follow_parts = follow_text.split('/') if follow_text is not None else []
    follow_num = follow_parts[0] if len(follow_parts) > 0 else _UNKNOWN
    follow_date = follow_parts[1] if len(follow_parts) > 1 else _UNKNOWN

    row = {
        '链接': href,
        '标题': div.css('.title a::text').get(),
        '小区': community,
        '位置': location,
        '单价': unit_price,
        '总价': total_price,
    }
    row.update(house)
    row.update({
        '关注人数': follow_num,
        '发布时间': follow_date,
        # Written to the CSV as the string form of a Python list.
        '标签': div.css('.tag span::text').getall(),
        '推荐标签': div.css('.title span::text').get(),
    })
    return row


def getHouse(area, start, end):
    """Scrape listing pages for one district and append rows to the CSV.

    Args:
        area: District slug inserted into the listing URL.
        start: First page number to fetch.
        end: Page number to stop before (exclusive, as in ``range``).

    Each parsed listing is printed and written through the module-level
    ``csv_writer``. Pages whose request fails and listings whose house-info
    text cannot be parsed are reported on stdout and skipped.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0'
    }
    # NOTE(review): requests picks a proxy by URL scheme, so this 'http' entry
    # should not apply to the https:// URL below - confirm whether an 'https'
    # entry was intended.
    proxies = {"http":'27.192.173.108:9000'}

    for page in range(start, end):
        print(f'正在获取第{page}数据...')
        url = f'https://cq.lianjia.com/ershoufang/{area}/pg{page}/'

        try:
            response = requests.get(url=url, headers=headers, proxies=proxies, timeout=3)
        except requests.RequestException as exc:
            # One timeout or connection error should not abort the whole crawl.
            print(f'第{page}页请求失败，已跳过: {exc}')
            continue

        selector = parsel.Selector(response.text)
        # Every <li> under the listing container.
        divs = selector.css('.sellListContent li')
        print(divs)

        for div in divs:
            row = _parse_listing(div)
            if row is None:
                continue
            print(row)
            csv_writer.writerow(row)

def test():
    """Fetch the first Jiangbei listing page and print its raw HTML.

    Manual debugging aid: takes no arguments and returns None. Network
    errors raised by ``requests`` (including a timeout) propagate to the
    caller.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0'
    }
    url = 'https://cq.lianjia.com/ershoufang/jiangbei/pg1/'

    # Bound the wait the same way getHouse does; without a timeout the call
    # can block indefinitely on an unresponsive server.
    response = requests.get(url=url, headers=headers, timeout=3)

    print(response.text)

if __name__ == '__main__':
    # Crawl pages 1-99 of the configured district (the end page is exclusive).
    try:
        getHouse(area, 1, 100)
    finally:
        # Close the module-level CSV handle so buffered rows reach disk even
        # when the crawl stops on an exception.
        f.close()
    # To inspect one page's raw HTML instead, call test() here.